{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "26e0d51a",
   "metadata": {},
   "source": [
    "# Chapter 6 - Organisation Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fed13bcb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "import zipfile\n",
    "import io\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "450849af",
   "metadata": {},
   "source": [
    "## Step 1 - Data Acquisition"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93a2fb88",
   "metadata": {},
   "source": [
    "### Companies House Basic Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "658d9ef7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# UK Companies House Basic Company Data download page\n",
    "\n",
    "url=\"http://download.companieshouse.gov.uk/en_output.html\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15b1697a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download snapshots, convert json to dataframe, remove unwanted columns and append to a single dataframe\n",
    "\n",
    "df_ct = pd.DataFrame()\n",
    "with requests.Session() as req:\n",
    "        r = req.get(url)\n",
    "        soup = BeautifulSoup(r.content, 'html.parser')\n",
    "        snapshots = [f\"{url[:38]}{item['href']}\" for item in soup.select(\n",
    "            \"a[href*='BasicCompanyData-']\")]\n",
    "        for snapshot in snapshots:    \n",
    "            response = requests.get(snapshot).content     \n",
    "            zipsnapshot = zipfile.ZipFile(io.BytesIO(response))\n",
    "            tempfile = zipsnapshot.extract(zipsnapshot.namelist()[0])\n",
    "            print(zipsnapshot.namelist()[0])\n",
    "            df_c = pd.read_csv(tempfile, dtype='unicode')\n",
    "            df_c = df_c[['RegAddress.PostCode','CompanyName']]\n",
    "            df_ct = pd.concat([df_ct, df_c], ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "329143e3",
   "metadata": {},
   "source": [
    "### Maritime and Coastguard Agency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "552dde1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maritime and Coastguard Agency - List of approved recruitment and placement agencies\n",
    "\n",
    "url = \"https://www.gov.uk/government/publications/recruitment-and-placement-agencies-approved-by-the-mca\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5fcc1b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the csv file, extract name and drop erroroneous last row with all nulls\n",
    "\n",
    "with requests.Session() as req:\n",
    "    r = req.get(url)\n",
    "    soup = BeautifulSoup(r.content, 'html.parser')\n",
    "    targets = [f\"{item['href']}\" for item in soup.select(\n",
    "        \"a[href$='.csv']\")]\n",
    "    for target in targets:\n",
    "        response = req.get(target)    \n",
    "        df_m = pd.read_csv(io.BytesIO(response.content))\n",
    "        df_m = df_m.dropna(how='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d329063",
   "metadata": {},
   "source": [
    "### Saving to Local Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df874281",
   "metadata": {},
   "outputs": [],
   "source": [
    "#df_ct.to_csv('basic_raw.csv', index=False)\n",
    "df_c = pd.read_csv('basic_raw.csv')\n",
    "\n",
    "#df_m.to_csv('mari_raw.csv', index=False)\n",
    "df_m = pd.read_csv('mari_raw.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5631850",
   "metadata": {},
   "source": [
    "## Step 2 - Data Standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae22d609",
   "metadata": {},
   "outputs": [],
   "source": [
    "#%pip install matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa9763e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.hist(df_m.apply(lambda row: len(row['ADDRESS & CONTACT DETAILS'].split(',')), axis=1).tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9810d97",
   "metadata": {},
   "source": [
    "### Companies House Basic Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25b0a91b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to separate stopwards from company names\n",
    "\n",
    "def strip_stopwords(raw_name):    \n",
    "    company_stopwords = { 'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING', \n",
    "        'HOLDINGS', 'UK', 'TRADING', 'LTD.', 'PLC','LLP' }\n",
    "    name_without_stopwords = []\n",
    "    stopwords = []\n",
    "    for raw_name_part in raw_name.split():\n",
    "        if raw_name_part in company_stopwords:\n",
    "            stopwords.append(raw_name_part)\n",
    "        else:\n",
    "            name_without_stopwords.append(raw_name_part)\n",
    "    return(' '.join(name_without_stopwords), ' '.join(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3065d5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip company name and rename postcode column\n",
    "\n",
    "df_c[['CompanyName','Stopwords']] = pd.DataFrame(zip(*df_c['CompanyName'].apply(strip_stopwords))).T\n",
    "df_c = df_c.rename(columns={\"RegAddress.PostCode\": \"Postcode\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5872d6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unnecessary columns and add index\n",
    "\n",
    "df_c = df_c[['Postcode','CompanyName','Stopwords']]\n",
    "df_c['unique_id'] = df_c.index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39769f33",
   "metadata": {},
   "source": [
    "### Maritime and Coastguard Agency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b605c10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to extract postcode using regular expression\n",
    "\n",
    "import re\n",
    "def extract_postcode(address):\n",
    "    pattern = re.compile(r'([A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2})')\n",
    "    postcode = pattern.search(address)\n",
    "    if(postcode is not None):\n",
    "         return postcode.group()\n",
    "    else:\n",
    "         return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0583087",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Company name to uppercase\n",
    "\n",
    "df_m['CompanyName'] = df_m['COMPANY'].str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11b351b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip company name and extract postcode\n",
    "\n",
    "df_m[['CompanyName','Stopwords']] = pd.DataFrame(zip(*df_m['CompanyName'].apply(strip_stopwords))).T\n",
    "df_m['Postcode'] = df_m.apply(lambda row: extract_postcode(row['ADDRESS & CONTACT DETAILS']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e81408b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unnecessary columns, drop rows with missing data and add index\n",
    "\n",
    "df_m = df_m[['Postcode','CompanyName','Stopwords']]\n",
    "df_m = df_m.dropna()\n",
    "df_m['unique_id'] = df_m.index\n",
    "\n",
    "len(df_m)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64836253",
   "metadata": {},
   "source": [
    "### Saving to Local Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16f47da7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#df_c.to_csv('basic_clean.csv', index=False)\n",
    "df_c = pd.read_csv('basic_clean.csv')\n",
    "\n",
    "#df_m.to_csv('mari_clean.csv', index=False)\n",
    "df_m = pd.read_csv('mari_clean.csv')\n",
    "len(df_m)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b459699",
   "metadata": {},
   "source": [
    "# Step 3 - Record Blocking and Attribute Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23af6b30",
   "metadata": {},
   "outputs": [],
   "source": [
    "import splink"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9218066e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict only on exact company name or postcode match\n",
    "\n",
    "from splink.duckdb.linker import DuckDBLinker\n",
    "from splink.duckdb import comparison_library as cl\n",
    "settings = {\n",
    "    \"link_type\": \"link_only\",\n",
    "    \"blocking_rules_to_generate_predictions\": [\n",
    "        \"l.Postcode = r.Postcode\",\n",
    "        \"l.CompanyName = r.CompanyName\",\n",
    "    ],\n",
    "    \"comparisons\": [\n",
    "        cl.jaro_winkler_at_thresholds(\"CompanyName\",[0.9,0.8]),\n",
    "        cl.jaro_winkler_at_thresholds(\"Stopwords\",[0.9]),\n",
    "    ],\n",
    "    \"retain_intermediate_calculation_columns\" : True,\n",
    "    \"retain_matching_columns\" : True\n",
    "}\n",
    "linker = DuckDBLinker([df_m, df_c], settings, input_table_aliases=[\"_m\", \"_c\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db0a60af",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.cumulative_num_comparisons_from_blocking_rules_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2a68c5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use 50 million targets rows to ensure u estimation\n",
    "\n",
    "linker.estimate_u_using_random_sampling(max_pairs=5e7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f30017d",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.estimate_parameters_using_expectation_maximisation(\"l.Postcode = r.Postcode\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43e03803",
   "metadata": {},
   "outputs": [],
   "source": [
    "#linker.save_model_to_json(\"Chapter6_Splink_Settings.json\", overwrite=True)\n",
    "linker.load_settings(\"Chapter6_Splink_Settings.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8531f92c",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.match_weights_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d858639",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.m_u_parameters_chart()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b558cec6",
   "metadata": {},
   "source": [
    "# Step 4 - Match Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56495421",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate predictions\n",
    "\n",
    "df_pred = linker.predict(threshold_match_probability=0.05).as_pandas_dataframe()\n",
    "len(df_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5427dfbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pd.unique(df_pred['CompanyName_r']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a31fafaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "postname = df_pred[(df_pred['CompanyName_l']==df_pred['CompanyName_r']) & (df_pred['Postcode_l']==df_pred['Postcode_r'])]\n",
    "len(postname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae036de9",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pd.unique(postname['CompanyName_r']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c20fa0fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "notname = df_pred[df_pred['CompanyName_l']!=df_pred['CompanyName_r']]\n",
    "len(notname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81dceb42",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pd.unique(notname['CompanyName_r']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "290476a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "notpost = df_pred[df_pred['Postcode_l']!=df_pred['Postcode_r']]\n",
    "len(notpost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abb952f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pd.unique(notpost['CompanyName_r']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3160a718",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = df_m.merge(df_pred,left_on=['unique_id'], right_on=['unique_id_r'],how='left',\n",
    "          suffixes=('_m', '_p'))\n",
    "results[results['match_weight'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "992fb681",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.waterfall_chart(df_pred.to_dict(orient=\"records\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e097b305",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pred.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9bdccff6",
   "metadata": {},
   "source": [
    "# Step 5 - Resolve New Entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e957c7b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "record = {    \n",
    " 'unique_id': 1,\n",
    " 'Postcode': \"BH15 4QE\",\n",
    " 'CompanyName': \"VANTAGE YACHT RECRUITMENT\",\n",
    " 'Stopwords' : \"\"\n",
    "}\n",
    "\n",
    "df_new = linker.find_matches_to_new_records([record], match_weight_threshold=0).as_pandas_dataframe()\n",
    "df_new.sort_values(\"match_weight\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26ef9e78",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
